import pandas as pd
import numpy as np

# Suffix of the input file name (the "rich_<c>" variant of the narratives data).
c = 1000

# Read the narratives once and stack two copies of them: one in which the
# entity is the agent role (ARG0) and one in which it is the patient role
# (ARG1), so that every narrative contributes a row for each of its entities.
# NOTE(review): the column was spelled 'ARGO' (letter O); it is assumed to be
# a typo for 'ARG0' (digit zero), the counterpart of 'ARG1' -- confirm against
# the CSV header.
narratives = pd.read_csv('../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_rich_{0}.csv'.format(c))
df = narratives.assign(entity=narratives['ARG0'])
df1 = narratives.assign(entity=narratives['ARG1'])

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (original row labels are kept, so they repeat across both halves).
df = pd.concat([df, df1])

# Keep only rows attributed to one of the two parties, treat infinite values
# as missing, and discard rows without a narrative log odds ratio.
df = df[df['party'].isin(['Republican', 'Democrat'])]
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=['log_or'])

# Get log odds for entities.
# Work on an explicit copy so the helper columns below are not assigned onto
# a slice of df (which triggers SettingWithCopyWarning).
odds_ratio = df[['entity', 'party']].copy()

# Per-entity counts, broadcast back to every row of that entity:
#   rep       -- rows of the entity coming from Republicans
#   frequency -- all rows of the entity
#   dem       -- the remainder, i.e. rows coming from Democrats
odds_ratio['rep'] = (odds_ratio['party'] == 'Republican').astype('int64')
odds_ratio['rep'] = odds_ratio['rep'].groupby(odds_ratio['entity']).transform('sum')
odds_ratio['n'] = 1
odds_ratio['frequency'] = odds_ratio['n'].groupby(odds_ratio['entity']).transform('sum')
odds_ratio['dem'] = odds_ratio['frequency'] - odds_ratio['rep']

# NOTE(review): these totals are summed over rows *after* 'rep' and 'dem'
# were replaced by per-entity counts, so each entity's count is added once per
# row of that entity rather than once overall. Kept as is to preserve the
# published numbers -- confirm this is the intended denominator.
rep_narratives = odds_ratio.rep.sum()
dem_narratives = odds_ratio.dem.sum()

# Odds of the entity among Republican rows relative to Democrat rows. A zero
# count on either side produces an infinite or undefined ratio; such values
# are not filtered here.
odds_ratio['entity_or'] = (odds_ratio['rep']/odds_ratio['dem'])/((rep_narratives-odds_ratio['rep'])/(dem_narratives-odds_ratio['dem']))
odds_ratio['log_entity_or'] = np.log(odds_ratio['entity_or'])

# One row per entity, then attach the entity-level log odds to every narrative.
odds_ratio = odds_ratio.drop_duplicates(subset=['entity'])[['entity', 'log_entity_or']]
df = df.merge(odds_ratio, on='entity', how='left')

# Keep only the columns needed from here on; copy so that adding 'party'
# below does not write to a slice of the previous frame.
df = df[['narrative', 'entity', 'log_or', 'log_entity_or']].copy()

# Define which narratives are mainly Republican or Democrat (as measured by an
# odds ratio): a negative narrative log odds means Democrat, otherwise
# Republican. np.where replaces the chained assignment
# df['party'][mask] = ..., which is not guaranteed to modify df.
df['party'] = np.where(df['log_or'] < 0, 'Democrat', 'Republican')

# Number of distinct narratives per entity and party, one row per entity.
# reindex guarantees both party columns exist even if one party has no
# narratives at all, so the 'Total' sum below cannot raise a KeyError.
temp = (
    df.groupby(['entity', 'party'])['narrative']
    .nunique()
    .reset_index()
    .pivot(index='entity', columns='party', values='narrative')
    .reindex(columns=['Democrat', 'Republican'])
    .reset_index()
)
temp = temp.fillna(0)
temp['Total'] = temp['Republican'] + temp['Democrat']

# Compute the entity divisiveness score: for each entity, the mean over its
# rows of |narrative log odds| - |entity log odds|.
df = df.drop_duplicates()

abs_narrative_lo = df['log_or'].abs()
abs_entity_lo = df['log_entity_or'].abs()
row_gap = abs_narrative_lo - abs_entity_lo
entity_key = df['entity']

df = df.assign(
    log_or=abs_narrative_lo.groupby(entity_key).transform('mean'),
    log_entity_or=abs_entity_lo,
    diff=row_gap.groupby(entity_key).transform('mean'),
)

# Collapse to the distinct entity-level rows.
df = df[['entity', 'diff', 'log_or', 'log_entity_or']].drop_duplicates()

# Re-arrange the data and output the table: attach the per-party narrative
# counts, give the columns their presentation names, and sort by score with
# the highest score first.
df = df.merge(temp, on='entity', how='left')
df = df[['entity', 'diff', 'log_or', 'log_entity_or', 'Total', 'Democrat', 'Republican']]
df = df.rename(columns={
    'entity': 'Entity',
    'diff': 'Score',
    'log_or': '|Log Odds Narratives|',
    'log_entity_or': '|Log Odds Entities|',
})
df = df.sort_values(by='Score', ascending=False)

# The output name has no placeholder, so it does not depend on `c`; the
# former no-op `.format(c)` call was dropped. The path itself is unchanged.
df.to_csv('../tables/Tables_2_and_H_1_2.csv', index=False)
